Raw data

- Study summary: Neurological complications are common in patients with COVID-19. While SARS-CoV-2, the causal pathogen of COVID-19, has been detected in some patient brains, its ability to infect brain cells and impact their function are not well understood, and experimental models using human brain cells are urgently needed. Here we investigated the susceptibility of human induced pluripotent stem cell (hiPSC)-derived monolayer brain cells and region-specific brain organoids to SARS-CoV-2 infection. We found modest numbers of infected neurons and astrocytes, but greater infection of choroid plexus epithelial cells. We optimized a protocol to generate choroid plexus organoids from hiPSCs, which revealed productive SARS-CoV-2 infection that leads to increased cell death and transcriptional dysregulation indicative of an inflammatory response and cellular function deficits. Together, our results provide evidence for SARS-CoV-2 neurotropism and support use of hiPSC-derived brain organoids as a platform to investigate the cellular susceptibility, disease mechanisms, and treatment strategies for SARS-CoV-2 infection. Bulk RNA-seq of choroid plexus organoids (CPOs) was performed on mock 72 hours post-infection (hpi), SARS-CoV-2 24 hpi, and SARS-CoV-2 72 hpi samples. All conditions were profiled in triplicate.

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)

Setting AnnotationHub

DB <- "EnsDb"                        # Set your DB of interest
AnnotationSpecies <- "Homo sapiens"  # Set your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))  # Bring annotation DB

Running AnnotationHub

# Filter annotation of interest
ahQuery <- query(ah, 
                 pattern=c(DB, AnnotationSpecies), 
                 ignore.case=T)      


# Select the most recent data
DBName <- mcols(ahQuery) %>%
    rownames() %>%
    tail(1)

AnnoDb <- ah[[DBName]] 

# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="TXID",
                 columns=c("GENEID", "GENENAME")) 


# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)    # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
##              TXID          GENEID GENENAME
## 1 ENST00000000233 ENSG00000004059     ARF5
## 2 ENST00000000412 ENSG00000003056     M6PR
## 3 ENST00000000442 ENSG00000173153    ESRRA
## 4 ENST00000001008 ENSG00000004478    FKBP4
## 5 ENST00000001146 ENSG00000003137  CYP26B1
## 6 ENST00000002125 ENSG00000003509  NDUFAF7

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
#
# Define sample names 
SampleNames <-  c(paste0("Mock-rep", 1:3), paste0("SARS-CoV-2-rep", 1:3)) 

# Define group level
GroupLevel <- c("Mock", "Covid")

# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)


# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC

# Define .sf file path
sf <- c(paste0("../salmon_output/", 
               SampleNames,
               ".salmon_quant/quant.sf"))

# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)

rownames(metadata) <- SampleNames

# Explore the metadata
print(metadata)
##                          Sample Group
## Mock-rep1             Mock-rep1  Mock
## Mock-rep2             Mock-rep2  Mock
## Mock-rep3             Mock-rep3  Mock
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid
##                                                                   Path
## Mock-rep1             ../salmon_output/Mock-rep1.salmon_quant/quant.sf
## Mock-rep2             ../salmon_output/Mock-rep2.salmon_quant/quant.sf
## Mock-rep3             ../salmon_output/Mock-rep3.salmon_quant/quant.sf
## SARS-CoV-2-rep1 ../salmon_output/SARS-CoV-2-rep1.salmon_quant/quant.sf
## SARS-CoV-2-rep2 ../salmon_output/SARS-CoV-2-rep2.salmon_quant/quant.sf
## SARS-CoV-2-rep3 ../salmon_output/SARS-CoV-2-rep3.salmon_quant/quant.sf

Converting .sf files to txi list

- txi_tpm: stores TPM with the argument “countsFromAbundance=”lengthScaledTPM"

- txi_counts: stores original counts

- In this project, two txi objects were created with or without the countsFromAbundance=“lengthScaledTPM” argument and compared in downstream DE analysis.

- If you don’t want gene-level summarization, set txOut=TRUE.

- References: tximport doc, DESeq2 doc “Why unnormalized counts?”, Soneson et al. 2016, Developer Dr. Love’s comment, Harvard Chan Bioinformatics Core workshop

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames

# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 

txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

# tpm 
head(txi_tpm$counts)
##                   Mock-rep1    Mock-rep2 Mock-rep3 SARS-CoV-2-rep1
## ENSG00000000003 9517.216266 7376.0754350 7413.2792    10930.427303
## ENSG00000000005    7.046572    3.9313942    0.0000        2.004902
## ENSG00000000419  878.258176  916.2463792  728.9124     1201.916522
## ENSG00000000457  711.679321  553.5118176  561.1932      777.339592
## ENSG00000000460  465.051715  352.6855963  391.7691      731.425995
## ENSG00000000938    0.000000    0.9945102    0.0000        0.000000
##                 SARS-CoV-2-rep2 SARS-CoV-2-rep3
## ENSG00000000003    6030.3241392     7835.437604
## ENSG00000000005       0.9863951        4.002353
## ENSG00000000419     783.6099539      991.786871
## ENSG00000000457     459.4668351      603.497683
## ENSG00000000460     377.0007259      536.931827
## ENSG00000000938       0.0000000        0.000000
dim(txi_tpm$counts)
## [1] 60217     6
# counts
head(txi_counts$counts)
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003  8991.942  7660.942  7504.000       11037.869        6193.989
## ENSG00000000005     7.000     4.000     0.000           2.000           1.000
## ENSG00000000419   889.000   928.001   730.000        1211.001         777.000
## ENSG00000000457   699.297   564.374   566.745         832.783         437.987
## ENSG00000000460   452.703   366.157   397.262         740.218         388.013
## ENSG00000000938     0.000     1.000     0.000           0.000           0.000
##                 SARS-CoV-2-rep3
## ENSG00000000003        7772.929
## ENSG00000000005           4.000
## ENSG00000000419         991.000
## ENSG00000000457         596.733
## ENSG00000000460         514.268
## ENSG00000000938           0.000
dim(txi_counts$counts)
## [1] 60217     6

Creating DESeq objects from txi and VST

- Note: The tximport-to-DESeq2 approach uses estimated gene counts from the transcript abundance quantifiers, but not normalized counts.

- The DESeqDataSetFromTximport() function generated an DESeq object (aka dds) with the txi input.

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 

    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)

    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)

    # Output them as a list 
    return(list(dds=des, vsd=ves))

}

TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 60217 6 
## metadata(1): version
## assays(1): counts
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
##   ENSG00000288696
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003      9517      7376      7413           10930            6030
## ENSG00000000005         7         4         0               2               1
## ENSG00000000419       878       916       729            1202             784
## ENSG00000000457       712       554       561             777             459
## ENSG00000000460       465       353       392             731             377
## ENSG00000000938         0         1         0               0               0
##                 SARS-CoV-2-rep3
## ENSG00000000003            7835
## ENSG00000000005               4
## ENSG00000000419             992
## ENSG00000000457             603
## ENSG00000000460             537
## ENSG00000000938               0
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 60217 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
##   ENSG00000288696
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003      8992      7661      7504           11038            6194
## ENSG00000000005         7         4         0               2               1
## ENSG00000000419       889       928       730            1211             777
## ENSG00000000457       699       564       567             833             438
## ENSG00000000460       453       366       397             740             388
## ENSG00000000938         0         1         0               0               0
##                 SARS-CoV-2-rep3
## ENSG00000000003            7773
## ENSG00000000005               4
## ENSG00000000419             991
## ENSG00000000457             597
## ENSG00000000460             514
## ENSG00000000938               0

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 60217 6 
## metadata(1): version
## assays(1): ''
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
##   ENSG00000288696
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 60217 6 
## metadata(1): version
## assays(1): ''
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
##   ENSG00000288696
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- Messages when “Counts <- DESeqPrep_fn(Counts)” was run: using ‘avgTxLength’ from assays(dds)

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##       Mock-rep1       Mock-rep2       Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2 
##       1.0962032       1.0446805       0.8945333       1.3649221       0.7478378 
## SARS-CoV-2-rep3 
##       1.0187286
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##                          Sample    Group                   Path
##                        <factor> <factor>            <character>
## Mock-rep1       Mock-rep1          Mock  ../salmon_output/Moc..
## Mock-rep2       Mock-rep2          Mock  ../salmon_output/Moc..
## Mock-rep3       Mock-rep3          Mock  ../salmon_output/Moc..
## SARS-CoV-2-rep1 SARS-CoV-2-rep1    Covid ../salmon_output/SAR..
## SARS-CoV-2-rep2 SARS-CoV-2-rep2    Covid ../salmon_output/SAR..
## SARS-CoV-2-rep3 SARS-CoV-2-rep3    Covid ../salmon_output/SAR..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##                          Sample    Group                   Path sizeFactor
##                        <factor> <factor>            <character>  <numeric>
## Mock-rep1       Mock-rep1          Mock  ../salmon_output/Moc..   1.096203
## Mock-rep2       Mock-rep2          Mock  ../salmon_output/Moc..   1.044680
## Mock-rep3       Mock-rep3          Mock  ../salmon_output/Moc..   0.894533
## SARS-CoV-2-rep1 SARS-CoV-2-rep1    Covid ../salmon_output/SAR..   1.364922
## SARS-CoV-2-rep2 SARS-CoV-2-rep2    Covid ../salmon_output/SAR..   0.747838
## SARS-CoV-2-rep3 SARS-CoV-2-rep3    Covid ../salmon_output/SAR..   1.018729

Plotting the size factors in TPM

- The size factors are only available from TPM dds

- Blue dashed line: normalization factor = 1

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))

colnames(sizeFactor) <- 'Size_Factor'

sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 

sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)



# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")

Plotting nornalization factors in the Counts

- DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples. The counts for a gene in each sample is then divided by this mean. The median of these ratios in a sample is the size factor for that sample.

- Blue dashed line: normalization factor = 1

- Colored dots: normlization factors per gene (y-axis) in each sample

- Box plots: distribution of the normalization facters in each group (see the second plot)

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA, alpha=0.5) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.5, 1.5)

Setting functions for QC plots

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1] 

# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {

    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}

# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]

# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {

    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd

    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis (PCA)

- Checkpoints in PCA: source of variation, sample outlier

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

- Checkpoints of correlation heatmap: distance between samples, correlation in a group

- Upper: TPM input

- Lower: Counts input

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Running DE analysis with or without shrinkage

- Shrinkage reduces false positives

- Magnitude of shrinkage is affected by dispersion and sample size

- When the argument type is set to “apeglm”, the coef argument is used instead of constrast. In this dataset, you can set coef=Coef where Coef <- resultsNames(ddsList[[1]]).

- When the type is set to “normal”, the argument contrast is set as shown below.

- References: DESeq2 doc “Alternative shrinkage estimators”, Harvard Chan Bioinformatics Core workshop

# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]]) 
names(desList) <- TvC

# Create a list for TPM and Counts dds 
ddsList <- desList  # DE without shrinkage  
normal.ddsList <- desList    # DE with "normal" shrinkage
ape.ddsList <- desList       # DE with "apeglm" shrinkage
ash.ddsList <- desList       # DE with "ashr" shrinkage

for (x in TvC) {
    
    # Run DESeq() 
    ddsList[[x]] <- DESeq(desList[[x]])
    print(resultsNames(ddsList[[x]]))

    normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                                contrast=Contrast,
                                type="normal")

    ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="apeglm")

    ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="ashr")

}
## [1] "Intercept"           "Group_Covid_vs_Mock"
## [1] "Intercept"           "Group_Covid_vs_Mock"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Plot dispersion  
for (x in TvC) {

    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Extracting DE results with or without shrinkage

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold 
alpha=0.1 

# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 

# Initialize lists of result tables 
all.resList <- all.ddsList 

# Set a function cleaning table
Sig.fn <- function(df, Input) {
    
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}




for (i in shrinkage) {

    if (i == "None") {

        for (x in TvC) {

        # Extract data frames from unshrunken lfc & clean data 
        all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]], 
                                                       contrast=Contrast, 
                                                       alpha=alpha)) %>% 
        Sig.fn(x)

         } 
    } else {

        # Extract data frames from shrunken lfc & clean data
        for (x in TvC) {

            all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
                Sig.fn(x)
    

        }
    }
}





# Explore the results 
summary(all.resList)
##        Length Class  Mode
## None   2      -none- list
## Normal 2      -none- list
## Apeglm 2      -none- list
## Ashr   2      -none- list
head(all.resList[[1]][['TPM']])
##              Gene     baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 7965.2177921     0.01615004 0.1264025  0.1277667 0.89833358
## 2 ENSG00000000005    2.8239226     0.61324799 1.5975442  0.3838692 0.70107541
## 3 ENSG00000000419  899.2457234    -0.21790393 0.1438633 -1.5146597 0.12985862
## 4 ENSG00000000457  596.9849855     0.02668507 0.1574980  0.1694312 0.86545751
## 5 ENSG00000000460  461.1867396    -0.38591640 0.1724174 -2.2382680 0.02520359
## 6 ENSG00000000938    0.1595384     0.96899351 4.0804729  0.2374709 0.81229150
##        padj   FDR Input
## 1 0.9647431 > 0.1   TPM
## 2        NA > 0.1   TPM
## 3 0.3739816 > 0.1   TPM
## 4 0.9523724 > 0.1   TPM
## 5 0.1284334 > 0.1   TPM
## 6        NA > 0.1   TPM
head(all.resList[[1]][['Counts']])
##              Gene    baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 8079.086540     0.01545565 0.1269911  0.1217065 0.90313144
## 2 ENSG00000000005    2.859546     0.61242873 1.5819550  0.3871341 0.69865693
## 3 ENSG00000000419  912.475607    -0.21780383 0.1444052 -1.5082829 0.13148215
## 4 ENSG00000000457  605.612525     0.02396882 0.1577268  0.1519642 0.87921520
## 5 ENSG00000000460  467.640244    -0.38737335 0.1721631 -2.2500366 0.02444662
## 6 ENSG00000000938    0.161090     0.96756753 4.0804729  0.2371214 0.81256259
##        padj   FDR  Input
## 1 0.9671255 > 0.1 Counts
## 2        NA > 0.1 Counts
## 3 0.3778008 > 0.1 Counts
## 4 0.9576855 > 0.1 Counts
## 5 0.1257650 > 0.1 Counts
## 6        NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
##              Gene     baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 7965.2177921     0.01575077 0.1232768  0.1277667 0.89833358
## 2 ENSG00000000005    2.8239226     0.12156188 0.3170565  0.3838692 0.70107541
## 3 ENSG00000000419  899.2457234    -0.21097097 0.1392877 -1.5146597 0.12985862
## 4 ENSG00000000457  596.9849855     0.02567417 0.1515329  0.1694312 0.86545751
## 5 ENSG00000000460  461.1867396    -0.36853131 0.1646435 -2.2382680 0.02520359
## 6 ENSG00000000938    0.1595384     0.03533431 0.1487968  0.2374709 0.81229150
##        padj   FDR Input
## 1 0.9647431 > 0.1   TPM
## 2        NA > 0.1   TPM
## 3 0.3739816 > 0.1   TPM
## 4 0.9523724 > 0.1   TPM
## 5 0.1284334 > 0.1   TPM
## 6        NA > 0.1   TPM
head(all.resList[[2]][['Counts']])
##              Gene    baseMean log2FoldChange     lfcSE       stat     pvalue
## 1 ENSG00000000003 8079.086540     0.01506713 0.1237982  0.1217065 0.90313144
## 2 ENSG00000000005    2.859546     0.12255440 0.3169599  0.3871341 0.69865693
## 3 ENSG00000000419  912.475607    -0.21077061 0.1397439 -1.5082829 0.13148215
## 4 ENSG00000000457  605.612525     0.02305135 0.1516914  0.1519642 0.87921520
## 5 ENSG00000000460  467.640244    -0.36984015 0.1643650 -2.2500366 0.02444662
## 6 ENSG00000000938    0.161090     0.03501887 0.1476858  0.2371214 0.81256259
##        padj   FDR  Input
## 1 0.9671255 > 0.1 Counts
## 2        NA > 0.1 Counts
## 3 0.3778008 > 0.1 Counts
## 4 0.9576855 > 0.1 Counts
## 5 0.1257650 > 0.1 Counts
## 6        NA > 0.1 Counts

Exploring mean-difference relationship with MA plots

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

- Reference: DESeq2 doc “MA-plot”

# Set ylim: has to adjusted by users depending on data 
yl <- c(-25, 25)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Initialize a list storing MA plots
MAList <- ddsList


# Create MA plots

for (i in shrinkage) {

    both.df <- rbind(all.resList[[i]][[TvC[1]]], 
                     all.resList[[i]][[TvC[2]]])

    MAList[[i]] <- ggplot(both.df, 
                              aes(x=baseMean, y=log2FoldChange, color=FDR))  +geom_point() + scale_x_log10() + facet_grid(~Input) + 
                                   theme_bw() + 
                                   scale_color_manual(values=c("blue", "grey")) +
                                   ggtitle(paste("MA plot with", i)) +
                                   ylim(yl[1], yl[2]) + 
                                   geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red") 

}

   

# Print MA plots
MAList
## $TPM
## class: DESeqDataSet 
## dim: 60217 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
##   ENSG00000288696
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
## 
## $Counts
## class: DESeqDataSet 
## dim: 60217 6 
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
##   ENSG00000288696
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
## 
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

Exploring distribution of false discovery rate (FDR)

- Distribution of adjusted p-val (FDR) was presented

- x-axis: FDR

- y-axis: Number of genes

- Black dashed line: FDR = alpha

# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']], 
                 all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)

# plot distribution of fdr
ggplot(both.df,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("distribution of false discovery rate (fdr)") + 
xlab("adjusted p-value (fdr)") + 
ylab("count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Exploring distribution of log2FoldChange by input type

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

# Initialize a lfc list
lfcplotList <- all.resList 

# Create lfc distribution plots
for (i in shrinkage) {

    lfc.df <- rbind(all.resList[[i]][[TvC[1]]], 
                    all.resList[[i]][[TvC[2]]])

    lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]

    lfc.df$Input <- factor(lfc.df$Input, levels=TvC)

    lfcplotList[[i]] <- ggplot(lfc.df,  # Subset rows with FDR < alpha
                               aes(x=log2FoldChange, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() + ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black", 
           linetype="dashed", 
           size=1) + 
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) + 
xlim(-5, 5)


}

# Print the lfc plots
lfcplotList
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 


# Create a data frame storing NA gene number
NAstat <- both.df %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))

# Plot number of NA genes 
ggplot(NAstat, 
       aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

baseMean/LFC/FDR comparison between TPM and Count inputs

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(GENEID) %>% 
    summarize(num.trans=n_distinct(TXID))

# Create an empty list storing significant gene lfc tables 
sigList <- list() 


# Filter significant genes' lfc and save in the list 
for (x in shrinkage) {

    for (y in TvC) {

        # Subset genes with FDR below alpha
        sigList[[x]][[y]] <- subset(all.resList[[x]][[y]], 
                                    FDR == paste("<", alpha))

        # Explore the output 
        print(head(sigList[[x]][[y]]))
        print(dim(sigList[[x]][[y]]))

    }
}
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 9  ENSG00000001084 1579.91666      0.5260867 0.1525682  3.448207 5.643208e-04
## 14 ENSG00000001561 2231.51812      0.6038548 0.1471349  4.104090 4.059087e-05
## 18 ENSG00000001630 1971.01636     -0.4025373 0.1207053 -3.334877 8.533709e-04
## 30 ENSG00000002834 1072.81969     -0.6600582 0.1426174 -4.628176 3.689008e-06
## 38 ENSG00000003393 2974.86603     -0.3422075 0.1210011 -2.828135 4.682005e-03
## 40 ENSG00000003402   96.37838     -1.8291764 0.4290495 -4.263323 2.014093e-05
##            padj   FDR Input
## 9  6.833483e-03 < 0.1   TPM
## 14 7.692425e-04 < 0.1   TPM
## 18 9.584329e-03 < 0.1   TPM
## 30 9.655758e-05 < 0.1   TPM
## 38 3.681661e-02 < 0.1   TPM
## 40 4.164696e-04 < 0.1   TPM
## [1] 3342    9
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 9  ENSG00000001084 1599.77911      0.5255233 0.1526996  3.441550 5.783914e-04
## 14 ENSG00000001561 2264.37496      0.6032209 0.1470826  4.101239 4.109442e-05
## 18 ENSG00000001630 1999.73142     -0.4028765 0.1217931 -3.307877 9.400612e-04
## 30 ENSG00000002834 1088.29573     -0.6611145 0.1430768 -4.620696 3.824545e-06
## 38 ENSG00000003393 3018.81754     -0.3427407 0.1220640 -2.807878 4.986904e-03
## 40 ENSG00000003402   89.65349     -1.8179235 0.4271838 -4.255600 2.084889e-05
##            padj   FDR  Input
## 9  7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349    9
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 9  ENSG00000001084 1579.91666      0.5073449 0.1471299  3.448207 5.643208e-04
## 14 ENSG00000001561 2231.51812      0.5838044 0.1422449  4.104090 4.059087e-05
## 18 ENSG00000001630 1971.01636     -0.3934405 0.1179763 -3.334877 8.533709e-04
## 30 ENSG00000002834 1072.81969     -0.6394221 0.1381508 -4.628176 3.689008e-06
## 38 ENSG00000003393 2974.86603     -0.3344360 0.1182530 -2.828135 4.682005e-03
## 40 ENSG00000003402   96.37838     -1.4142977 0.3311581 -4.263323 2.014093e-05
##            padj   FDR Input
## 9  6.833483e-03 < 0.1   TPM
## 14 7.692425e-04 < 0.1   TPM
## 18 9.584329e-03 < 0.1   TPM
## 30 9.655758e-05 < 0.1   TPM
## 38 3.681661e-02 < 0.1   TPM
## 40 4.164696e-04 < 0.1   TPM
## [1] 3342    9
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 9  ENSG00000001084 1599.77911      0.5066282 0.1472061  3.441550 5.783914e-04
## 14 ENSG00000001561 2264.37496      0.5830545 0.1421609  4.101239 4.109442e-05
## 18 ENSG00000001630 1999.73142     -0.3935402 0.1189695 -3.307877 9.400612e-04
## 30 ENSG00000002834 1088.29573     -0.6401589 0.1385337 -4.620696 3.824545e-06
## 38 ENSG00000003393 3018.81754     -0.3347627 0.1192225 -2.807878 4.986904e-03
## 40 ENSG00000003402   89.65349     -1.4048001 0.3303655 -4.255600 2.084889e-05
##            padj   FDR  Input
## 9  7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349    9
##               Gene   baseMean log2FoldChange     lfcSE       pvalue
## 9  ENSG00000001084 1579.91666     -0.4726981 0.1532595 5.643208e-04
## 14 ENSG00000001561 2231.51812     -0.5555317 0.1486431 4.059087e-05
## 18 ENSG00000001630 1971.01636      0.3698951 0.1198304 8.533709e-04
## 30 ENSG00000002834 1072.81969      0.6164075 0.1443250 3.689008e-06
## 38 ENSG00000003393 2974.86603      0.3107452 0.1190360 4.682005e-03
## 40 ENSG00000003402   96.37838      1.6180973 0.4563044 2.014093e-05
##            padj   FDR Input
## 9  6.833483e-03 < 0.1   TPM
## 14 7.692425e-04 < 0.1   TPM
## 18 9.584329e-03 < 0.1   TPM
## 30 9.655758e-05 < 0.1   TPM
## 38 3.681661e-02 < 0.1   TPM
## 40 4.164696e-04 < 0.1   TPM
## [1] 3342    8
##               Gene   baseMean log2FoldChange     lfcSE       pvalue
## 9  ENSG00000001084 1599.77911     -0.4720725 0.1533639 5.783914e-04
## 14 ENSG00000001561 2264.37496     -0.5550610 0.1485718 4.109442e-05
## 18 ENSG00000001630 1999.73142      0.3696152 0.1208830 9.400612e-04
## 30 ENSG00000002834 1088.29573      0.6172558 0.1447949 3.824545e-06
## 38 ENSG00000003393 3018.81754      0.3107198 0.1200109 4.986904e-03
## 40 ENSG00000003402   89.65349      1.6067050 0.4552665 2.084889e-05
##            padj   FDR  Input
## 9  7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349    8
##               Gene   baseMean log2FoldChange     lfcSE       pvalue
## 9  ENSG00000001084 1579.91666     -0.4256113 0.1526027 5.643208e-04
## 14 ENSG00000001561 2231.51812     -0.5122028 0.1506099 4.059087e-05
## 18 ENSG00000001630 1971.01636      0.3387335 0.1181537 8.533709e-04
## 30 ENSG00000002834 1072.81969      0.5771369 0.1469019 3.689008e-06
## 38 ENSG00000003393 2974.86603      0.2820018 0.1156657 4.682005e-03
## 40 ENSG00000003402   96.37838      1.4139227 0.5036537 2.014093e-05
##            padj   FDR Input
## 9  6.833483e-03 < 0.1   TPM
## 14 7.692425e-04 < 0.1   TPM
## 18 9.584329e-03 < 0.1   TPM
## 30 9.655758e-05 < 0.1   TPM
## 38 3.681661e-02 < 0.1   TPM
## 40 4.164696e-04 < 0.1   TPM
## [1] 3342    8
##               Gene   baseMean log2FoldChange     lfcSE       pvalue
## 9  ENSG00000001084 1599.77911     -0.4251008 0.1526704 5.783914e-04
## 14 ENSG00000001561 2264.37496     -0.5117762 0.1505215 4.109442e-05
## 18 ENSG00000001630 1999.73142      0.3382107 0.1191277 9.400612e-04
## 30 ENSG00000002834 1088.29573      0.5778473 0.1473885 3.824545e-06
## 38 ENSG00000003393 3018.81754      0.2817658 0.1165975 4.986904e-03
## 40 ENSG00000003402   89.65349      1.3993226 0.5011852 2.084889e-05
##            padj   FDR  Input
## 9  7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349    8
# Clean the data frames with renaming columns
for (x in shrinkage) {

    # Join TPM and Counts tables by GENEID
    df <- inner_join(sigList[[x]][[1]], 
                     sigList[[x]][[2]],
                     by="Gene") 

    # Create a vector storing original column nams 
    my.colname1 <- colnames(sigList[[x]][[1]])[-1]

    # Create a vector storing modified column names
    my.colname2 <- c("GENEID", 
                     paste0(my.colname1, "_", TvC[1]), 
                     paste0(my.colname1, "_", TvC[2]))

    # Rename the columns
    colnames(df) <- my.colname2

    # Add a column storing shrinkage method and drop redundant columns
    df <- df %>% 
        mutate(Shrinkage = x) %>% 
        dplyr::select(-starts_with(c("lfcSE", "stat", "FDR")))

    # Save the cleaned data frame in the list
    sigList[[x]] <- df

    # Explore the output data frame
    print(head(df))
    print(dim(df))
}
##            GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM     padj_TPM
## 1 ENSG00000001084   1579.91666          0.5260867 5.643208e-04 6.833483e-03
## 2 ENSG00000001561   2231.51812          0.6038548 4.059087e-05 7.692425e-04
## 3 ENSG00000001630   1971.01636         -0.4025373 8.533709e-04 9.584329e-03
## 4 ENSG00000002834   1072.81969         -0.6600582 3.689008e-06 9.655758e-05
## 5 ENSG00000003393   2974.86603         -0.3422075 4.682005e-03 3.681661e-02
## 6 ENSG00000003402     96.37838         -1.8291764 2.014093e-05 4.164696e-04
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts  padj_Counts
## 1       TPM      1599.77911             0.5255233  5.783914e-04 7.030493e-03
## 2       TPM      2264.37496             0.6032209  4.109442e-05 7.727078e-04
## 3       TPM      1999.73142            -0.4028765  9.400612e-04 1.036219e-02
## 4       TPM      1088.29573            -0.6611145  3.824545e-06 9.925706e-05
## 5       TPM      3018.81754            -0.3427407  4.986904e-03 3.894634e-02
## 6       TPM        89.65349            -1.8179235  2.084889e-05 4.273907e-04
##   Input_Counts Shrinkage
## 1       Counts      None
## 2       Counts      None
## 3       Counts      None
## 4       Counts      None
## 5       Counts      None
## 6       Counts      None
## [1] 3312   12
##            GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM     padj_TPM
## 1 ENSG00000001084   1579.91666          0.5073449 5.643208e-04 6.833483e-03
## 2 ENSG00000001561   2231.51812          0.5838044 4.059087e-05 7.692425e-04
## 3 ENSG00000001630   1971.01636         -0.3934405 8.533709e-04 9.584329e-03
## 4 ENSG00000002834   1072.81969         -0.6394221 3.689008e-06 9.655758e-05
## 5 ENSG00000003393   2974.86603         -0.3344360 4.682005e-03 3.681661e-02
## 6 ENSG00000003402     96.37838         -1.4142977 2.014093e-05 4.164696e-04
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts  padj_Counts
## 1       TPM      1599.77911             0.5066282  5.783914e-04 7.030493e-03
## 2       TPM      2264.37496             0.5830545  4.109442e-05 7.727078e-04
## 3       TPM      1999.73142            -0.3935402  9.400612e-04 1.036219e-02
## 4       TPM      1088.29573            -0.6401589  3.824545e-06 9.925706e-05
## 5       TPM      3018.81754            -0.3347627  4.986904e-03 3.894634e-02
## 6       TPM        89.65349            -1.4048001  2.084889e-05 4.273907e-04
##   Input_Counts Shrinkage
## 1       Counts    Normal
## 2       Counts    Normal
## 3       Counts    Normal
## 4       Counts    Normal
## 5       Counts    Normal
## 6       Counts    Normal
## [1] 3312   12
##            GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM     padj_TPM
## 1 ENSG00000001084   1579.91666         -0.4726981 5.643208e-04 6.833483e-03
## 2 ENSG00000001561   2231.51812         -0.5555317 4.059087e-05 7.692425e-04
## 3 ENSG00000001630   1971.01636          0.3698951 8.533709e-04 9.584329e-03
## 4 ENSG00000002834   1072.81969          0.6164075 3.689008e-06 9.655758e-05
## 5 ENSG00000003393   2974.86603          0.3107452 4.682005e-03 3.681661e-02
## 6 ENSG00000003402     96.37838          1.6180973 2.014093e-05 4.164696e-04
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts  padj_Counts
## 1       TPM      1599.77911            -0.4720725  5.783914e-04 7.030493e-03
## 2       TPM      2264.37496            -0.5550610  4.109442e-05 7.727078e-04
## 3       TPM      1999.73142             0.3696152  9.400612e-04 1.036219e-02
## 4       TPM      1088.29573             0.6172558  3.824545e-06 9.925706e-05
## 5       TPM      3018.81754             0.3107198  4.986904e-03 3.894634e-02
## 6       TPM        89.65349             1.6067050  2.084889e-05 4.273907e-04
##   Input_Counts Shrinkage
## 1       Counts    Apeglm
## 2       Counts    Apeglm
## 3       Counts    Apeglm
## 4       Counts    Apeglm
## 5       Counts    Apeglm
## 6       Counts    Apeglm
## [1] 3312   12
##            GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM     padj_TPM
## 1 ENSG00000001084   1579.91666         -0.4256113 5.643208e-04 6.833483e-03
## 2 ENSG00000001561   2231.51812         -0.5122028 4.059087e-05 7.692425e-04
## 3 ENSG00000001630   1971.01636          0.3387335 8.533709e-04 9.584329e-03
## 4 ENSG00000002834   1072.81969          0.5771369 3.689008e-06 9.655758e-05
## 5 ENSG00000003393   2974.86603          0.2820018 4.682005e-03 3.681661e-02
## 6 ENSG00000003402     96.37838          1.4139227 2.014093e-05 4.164696e-04
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts  padj_Counts
## 1       TPM      1599.77911            -0.4251008  5.783914e-04 7.030493e-03
## 2       TPM      2264.37496            -0.5117762  4.109442e-05 7.727078e-04
## 3       TPM      1999.73142             0.3382107  9.400612e-04 1.036219e-02
## 4       TPM      1088.29573             0.5778473  3.824545e-06 9.925706e-05
## 5       TPM      3018.81754             0.2817658  4.986904e-03 3.894634e-02
## 6       TPM        89.65349             1.3993226  2.084889e-05 4.273907e-04
##   Input_Counts Shrinkage
## 1       Counts      Ashr
## 2       Counts      Ashr
## 3       Counts      Ashr
## 4       Counts      Ashr
## 5       Counts      Ashr
## 6       Counts      Ashr
## [1] 3312   12
# Convert a list of data frames to one single data frame 
lfcTable <- sigList[[1]] 

for (i in 2:length(shrinkage)) {

    lfcTable <- rbind(lfcTable, sigList[[i]])

}

# Explore the output data frame
head(lfcTable)
##            GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM     padj_TPM
## 1 ENSG00000001084   1579.91666          0.5260867 5.643208e-04 6.833483e-03
## 2 ENSG00000001561   2231.51812          0.6038548 4.059087e-05 7.692425e-04
## 3 ENSG00000001630   1971.01636         -0.4025373 8.533709e-04 9.584329e-03
## 4 ENSG00000002834   1072.81969         -0.6600582 3.689008e-06 9.655758e-05
## 5 ENSG00000003393   2974.86603         -0.3422075 4.682005e-03 3.681661e-02
## 6 ENSG00000003402     96.37838         -1.8291764 2.014093e-05 4.164696e-04
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts  padj_Counts
## 1       TPM      1599.77911             0.5255233  5.783914e-04 7.030493e-03
## 2       TPM      2264.37496             0.6032209  4.109442e-05 7.727078e-04
## 3       TPM      1999.73142            -0.4028765  9.400612e-04 1.036219e-02
## 4       TPM      1088.29573            -0.6611145  3.824545e-06 9.925706e-05
## 5       TPM      3018.81754            -0.3427407  4.986904e-03 3.894634e-02
## 6       TPM        89.65349            -1.8179235  2.084889e-05 4.273907e-04
##   Input_Counts Shrinkage
## 1       Counts      None
## 2       Counts      None
## 3       Counts      None
## 4       Counts      None
## 5       Counts      None
## 6       Counts      None
dim(lfcTable)
## [1] 13248    12
# Calculate differences between TPM and Counts input data 
# in baseMean, log2FoldChange, and padj
lfcTable <- lfcTable %>%

    mutate(mean_TC=baseMean_TPM - baseMean_Counts,
           lfc_TC=log2FoldChange_TPM - log2FoldChange_Counts,
           FDR_TC=padj_TPM - padj_Counts) %>%

# Add a column storing the number of alternative transcripts
left_join(AnnoDb.ntrans, by="GENEID")

# Explore the output data frame
head(lfcTable)
##            GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM     padj_TPM
## 1 ENSG00000001084   1579.91666          0.5260867 5.643208e-04 6.833483e-03
## 2 ENSG00000001561   2231.51812          0.6038548 4.059087e-05 7.692425e-04
## 3 ENSG00000001630   1971.01636         -0.4025373 8.533709e-04 9.584329e-03
## 4 ENSG00000002834   1072.81969         -0.6600582 3.689008e-06 9.655758e-05
## 5 ENSG00000003393   2974.86603         -0.3422075 4.682005e-03 3.681661e-02
## 6 ENSG00000003402     96.37838         -1.8291764 2.014093e-05 4.164696e-04
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts  padj_Counts
## 1       TPM      1599.77911             0.5255233  5.783914e-04 7.030493e-03
## 2       TPM      2264.37496             0.6032209  4.109442e-05 7.727078e-04
## 3       TPM      1999.73142            -0.4028765  9.400612e-04 1.036219e-02
## 4       TPM      1088.29573            -0.6611145  3.824545e-06 9.925706e-05
## 5       TPM      3018.81754            -0.3427407  4.986904e-03 3.894634e-02
## 6       TPM        89.65349            -1.8179235  2.084889e-05 4.273907e-04
##   Input_Counts Shrinkage    mean_TC        lfc_TC        FDR_TC num.trans
## 1       Counts      None -19.862453  0.0005634285 -1.970095e-04        14
## 2       Counts      None -32.856840  0.0006339146 -3.465262e-06         1
## 3       Counts      None -28.715059  0.0003391573 -7.778599e-04         4
## 4       Counts      None -15.476039  0.0010562739 -2.699484e-06         9
## 5       Counts      None -43.951509  0.0005332544 -2.129730e-03        75
## 6       Counts      None   6.724896 -0.0112529139 -1.092112e-05        25
dim(lfcTable)
## [1] 13248    16
# Set a function to create a vector storing plot labels
plotlabels.fn <- function(myvec, mylist, num) {

    vec <- c()

    for (i in 1:num) {
        vec <- c(vec, c(myvec[i], rep("", nrow(mylist[[i]]) - 1)))
    }

    return(vec)

}
my.param <- c("baseMean", "log2FoldChange", "padj")


# Slice and clean the data frame for input
lfcTable.comp <- lfcTable %>% 
    dplyr::select(GENEID, num.trans, Shrinkage, starts_with(my.param)) %>%  
    gather(Category, Value, -GENEID, -num.trans, -Shrinkage) %>% 
    separate(Category, c("Metric", "Input"), sep="_") %>% 
    pivot_wider(names_from=Input, values_from=Value) %>%
    nest(-Metric, -Shrinkage)  

# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.comp$data, 
                          ~cor(.x$TPM, .x$Counts)),
                  7)

# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec, 
                         lfcTable.comp$data, 
                         length(corr.vec))


# Unnest the data frame
lfcTable.comp <- lfcTable.comp %>% 
    unnest(data) 

# Add a column storing Rsquared labels 
lfcTable.comp$Rsquared.label <- rsq.vec


# Explore the cleaned data frame
head(lfcTable.comp)
## # A tibble: 6 x 7
##   Shrinkage Metric   GENEID          num.trans    TPM Counts Rsquared.label
##   <chr>     <chr>    <chr>               <int>  <dbl>  <dbl> <chr>         
## 1 None      baseMean ENSG00000001084        14 1580.  1600.  "0.9999992"   
## 2 None      baseMean ENSG00000001561         1 2232.  2264.  ""            
## 3 None      baseMean ENSG00000001630         4 1971.  2000.  ""            
## 4 None      baseMean ENSG00000002834         9 1073.  1088.  ""            
## 5 None      baseMean ENSG00000003393        75 2975.  3019.  ""            
## 6 None      baseMean ENSG00000003402        25   96.4   89.7 ""
# Nest the data frame by metric
lfcTable.comp <- lfcTable.comp %>%
    nest(-Metric) 
# Set a function creating a scatter plot
comp.scatter.fn <- function(df, 
                            xvar, 
                            yvar, 
                            met, 
                            xlabel, 
                            ylabel) {

    p <- ggplot(df, aes(x=xvar, 
                        y=yvar, 
                        color=log(num.trans), 
                        label=Rsquared.label)) + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Comparison in", met, "\n(with R-squared)")) + 
        geom_text(size=5, 
                  mapping=aes(x=Inf, y=Inf), 
                  vjust=2, hjust=3.5, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black") + 
scale_color_gradient(low="blue", high="red") +
facet_wrap(~Shrinkage, scales="free") + 
theme(strip.text.x=element_text(size=10)) +
xlab(paste("Input from", xlabel)) + 
ylab(paste("Input from", ylabel))


return(p)
}




# Print the plots
for (i in 1:nrow(lfcTable.comp)) {

    df <- lfcTable.comp$data[[i]]

    print(comp.scatter.fn(df,
                          df$TPM,
                          df$Counts,
                          lfcTable.comp$Metric[i], "TPM", "Counts"))
    
}

baseMean/LFC/FDR rank comparison between TPM and Count inputs

# Transform the data frame
lfcTable.rank <- lfcTable.comp %>% 
    unnest(data) %>%
    nest(-Metric, -Shrinkage)

# Clean and arrange the data frame
for (i in 1:length(lfcTable.rank$data)) {

    df <- lfcTable.rank$data[[i]] 

    # Create a vector storing rank (1 to the last)
    rank.vec <- 1:nrow(df)

    # Make descending order if the metric = "padj" and assigne rankings
    if (lfcTable.rank$Metric[[i]] == "padj") {

        df <- df %>%
            dplyr::arrange(TPM) %>% 
            mutate(rank.TPM=rank.vec) %>%
            dplyr::arrange(Counts) %>%
            mutate(rank.Counts=rank.vec) 

    # Make asending order otherwise and assign rankings
    } else {

        df <- df %>%
            dplyr::arrange(desc(abs(TPM))) %>%
            mutate(rank.TPM=rank.vec) %>% 
            dplyr::arrange(desc(abs(Counts))) %>%
            mutate(rank.Counts=rank.vec)

    }

    # Save the updated data frame 
    lfcTable.rank$data[[i]] <- df 



}

# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.rank$data, 
                          ~cor(.x$rank.TPM, .x$rank.Counts)),
                  7)

# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec, lfcTable.rank$data, length(corr.vec))

# Unnest the data frame
lfcTable.rank <- lfcTable.rank %>% 
    unnest(data) 

# Add a column storing Rsquared labels 
lfcTable.rank$Rsquared.label <- rsq.vec


# Explore the cleaned data frame
head(lfcTable.rank)
## # A tibble: 6 x 9
##   Metric  Shrinkage GENEID      num.trans     TPM Counts Rsquared.label rank.TPM
##   <chr>   <chr>     <chr>           <int>   <dbl>  <dbl> <chr>             <int>
## 1 baseMe… None      ENSG000001…         7  1.40e6 1.42e6 "0.999857"            1
## 2 baseMe… None      ENSG000002…         1  4.29e5 4.35e5 ""                    2
## 3 baseMe… None      ENSG000001…         1  2.89e5 2.93e5 ""                    3
## 4 baseMe… None      ENSG000001…        23  2.71e5 2.75e5 ""                    4
## 5 baseMe… None      ENSG000002…         4  2.02e5 2.05e5 ""                    5
## 6 baseMe… None      ENSG000001…        10  1.09e5 1.10e5 ""                    6
## # … with 1 more variable: rank.Counts <int>
# Nest the data frame by metric
lfcTable.rank <- lfcTable.rank %>%
    nest(-Metric) 

# Print the plots
for (i in 1:nrow(lfcTable.rank)) {

    df <- lfcTable.rank$data[[i]]

    pl <- comp.scatter.fn(df,
                          df$rank.TPM,
                          df$rank.Counts,
                          lfcTable.rank$Metric[i], 
                          "TPM", "Counts") + 
ggtitle(paste("Comparison in", lfcTable.rank$Metric[i], "Rank")) 

print(pl)

 
}

Summarizing up/down DEGs with an upset plot

- red bar: input type

- blue bar: directionality of gene expression change

# Set a function cleaning data to generate upset plots 
upset.input.fn <- function(df) {

    # Filter genes with valid padj
    df <- subset(df, !is.na(padj)) %>% 

        
        mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
               
               Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
               
               Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
               
               TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
               
               Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input? 

    # Create a list storing groups of interest
    upsetInput <- list(Up=df$Up, 
                       Down=df$Down, 
                       Unchanged=df$Unchanged, 
                       TPM_Input=df$TPM, 
                       Counts_Input=df$Counts) 

    return(upsetInput)

}

upsetList <- upset.input.fn(both.df)


# Create the upset plot 
upset(fromList(upsetList), 
      sets=names(upsetList),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red","blue", "blue", "blue"),
      text.scale = 1.5, number.angles=30) 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ashr_2.2-47                 apeglm_1.12.0              
##  [3] ensembldb_2.14.0            AnnotationFilter_1.14.0    
##  [5] GenomicFeatures_1.42.1      AnnotationDbi_1.52.0       
##  [7] UpSetR_1.4.0                gridExtra_2.3              
##  [9] pheatmap_1.0.12             DESeq2_1.30.1              
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [13] MatrixGenerics_1.2.0        matrixStats_0.58.0         
## [15] GenomicRanges_1.42.0        GenomeInfoDb_1.26.0        
## [17] IRanges_2.24.1              S4Vectors_0.28.1           
## [19] tximport_1.18.0             forcats_0.5.1              
## [21] stringr_1.4.0               dplyr_1.0.5                
## [23] purrr_0.3.4                 readr_1.4.0                
## [25] tidyr_1.1.3                 tibble_3.1.0               
## [27] ggplot2_3.3.3               tidyverse_1.3.0            
## [29] AnnotationHub_2.22.0        BiocFileCache_1.14.0       
## [31] dbplyr_2.1.0                BiocGenerics_0.36.0        
## [33] rmarkdown_2.7               data.table_1.14.0          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                  backports_1.2.1              
##   [3] plyr_1.8.6                    lazyeval_0.2.2               
##   [5] splines_4.0.3                 BiocParallel_1.24.0          
##   [7] digest_0.6.27                 invgamma_1.1                 
##   [9] htmltools_0.5.1.1             SQUAREM_2021.1               
##  [11] fansi_0.4.2                   magrittr_2.0.1               
##  [13] memoise_2.0.0                 Biostrings_2.58.0            
##  [15] annotate_1.68.0               modelr_0.1.8                 
##  [17] askpass_1.1                   bdsmatrix_1.3-4              
##  [19] prettyunits_1.1.1             colorspace_2.0-0             
##  [21] blob_1.2.1                    rvest_0.3.6                  
##  [23] rappdirs_0.3.3                haven_2.3.1                  
##  [25] xfun_0.21                     crayon_1.4.1                 
##  [27] RCurl_1.98-1.2                jsonlite_1.7.2               
##  [29] genefilter_1.72.1             survival_3.2-7               
##  [31] glue_1.4.2                    gtable_0.3.0                 
##  [33] zlibbioc_1.36.0               XVector_0.30.0               
##  [35] DelayedArray_0.16.0           scales_1.1.1                 
##  [37] mvtnorm_1.1-1                 DBI_1.1.1                    
##  [39] Rcpp_1.0.6                    xtable_1.8-4                 
##  [41] progress_1.2.2                emdbook_1.3.12               
##  [43] bit_4.0.4                     truncnorm_1.0-8              
##  [45] httr_1.4.2                    RColorBrewer_1.1-2           
##  [47] ellipsis_0.3.1                farver_2.0.3                 
##  [49] pkgconfig_2.0.3               XML_3.99-0.5                 
##  [51] sass_0.3.1                    locfit_1.5-9.4               
##  [53] utf8_1.1.4                    labeling_0.4.2               
##  [55] tidyselect_1.1.0              rlang_0.4.10                 
##  [57] later_1.1.0.1                 munsell_0.5.0                
##  [59] BiocVersion_3.12.0            cellranger_1.1.0             
##  [61] tools_4.0.3                   cachem_1.0.4                 
##  [63] cli_2.3.1                     generics_0.1.0               
##  [65] RSQLite_2.2.3                 broom_0.7.5                  
##  [67] evaluate_0.14                 fastmap_1.1.0                
##  [69] yaml_2.2.1                    knitr_1.31                   
##  [71] bit64_4.0.5                   fs_1.5.0                     
##  [73] mime_0.10                     xml2_1.3.2                   
##  [75] biomaRt_2.46.3                compiler_4.0.3               
##  [77] rstudioapi_0.13               curl_4.3                     
##  [79] interactiveDisplayBase_1.28.0 reprex_1.0.0                 
##  [81] geneplotter_1.68.0            bslib_0.2.4                  
##  [83] stringi_1.5.3                 highr_0.8                    
##  [85] ps_1.5.0                      lattice_0.20-41              
##  [87] ProtGenerics_1.22.0           Matrix_1.3-2                 
##  [89] vctrs_0.3.6                   pillar_1.5.0                 
##  [91] lifecycle_1.0.0               BiocManager_1.30.10          
##  [93] jquerylib_0.1.3               irlba_2.3.3                  
##  [95] bitops_1.0-6                  httpuv_1.5.5                 
##  [97] rtracklayer_1.50.0            R6_2.5.0                     
##  [99] promises_1.2.0.1              MASS_7.3-53.1                
## [101] assertthat_0.2.1              openssl_1.4.3                
## [103] withr_2.4.1                   GenomicAlignments_1.26.0     
## [105] Rsamtools_2.6.0               GenomeInfoDbData_1.2.4       
## [107] hms_1.0.0                     grid_4.0.3                   
## [109] coda_0.19-4                   mixsqp_0.3-43                
## [111] bbmle_1.0.23.1                numDeriv_2016.8-1.1          
## [113] shiny_1.6.0                   lubridate_1.7.10